import operator
import numpy as np
from scipy.sparse import csr_matrix, csc_matrix
from ..core import core_modules as _cm

default_mode = _cm.compressed

_dtype_to_tensor = {_cm.bool:    _cm.TensorBool,
                    _cm.float32: _cm.TensorFloat,
                    _cm.float64: _cm.TensorDouble,
                    _cm.int8:    _cm.TensorInt8,
                    _cm.int16:   _cm.TensorInt16,
                    _cm.int32:   _cm.TensorInt32,
                    _cm.int64:   _cm.TensorInt64,
                    _cm.uint8:   _cm.TensorUInt8,
                    _cm.uint16:  _cm.TensorUInt16,
                    _cm.uint32:  _cm.TensorUInt32,
                    _cm.uint64:  _cm.TensorUInt64}

_dtype_error = "Invalid datatype. Must be bool, float32/64, (u)int8, (u)int16, (u)int32 or (u)int64"


class tensor:
    """ A mathematical tensor.

        A tensor object represents a mathematical tensor of arbitrary dimensions and is at the heart of this
        library . A tensor must consist of a homogeneous :class:`~pytaco.dtype` and be stored in a given
        :class:`~pytaco.format`. They can optionally be given a name.

        Taco allows users to compressed certain dimensions of tensors which means only the non-zeros and their index
        information is stored. This is useful for minimizing the size of the data that needs to be stored as well as
        speeding up computation since in some cases, taco can ignore computing 0'd out elements.

        Computations on tensors can be expressed using the :ref:`tfuncs` provided or by forming :ref:`iexpr` where
        tensors are indexed using :class:`index_var` s to specify different computations and reductions along their
        dimensions.

        All operators on the tensor class produce compressed values by default. This can be overridden by using the
        :ref:`tfuncs` methods provided and specifying your own format.

        Parameters
        -------------
            arg1:  int, float, iterable, optional

                If this argument is a python int or float, taco will create a scalar tensor and initialize it to the
                with the value passed in. If arg1 is an iterable, taco will interpret this as the shape and initialize
                a tensor with the given shape. The default value is none meaning that taco will simply create an empty
                scalar tensor and ignore the fmt argument.

            fmt: :class:`~pytaco.format`, optional
                :class:`format` to store the tensor in.

                * A list of :class:`mode_format` s can also be given in which case, the mode ordering is taken to be the
                  same as the list.

                * If a single :class:`mode_format` is given, all the dimensions of the tensor are initialized to that
                  :class:`mode_format`.

                * If not specified then all dimensions default to compressed.

            dtype: :class:`~pytaco.dtype`, optional
                The :class:`dtype` of the values stored by this tensor.

            name: str, optional
                The name of the tensor to be created. If not specified the name will be automatically ignored by taco.
                Most users can ignore this but specifying the name can be useful to those wanting to read the C code
                generated by taco.


        Notes
        -------
        Tensors are iterable and one can iterate over the coordinates, value pairs in a tensor as shown in the example
        section below. The tensor iterator only returns non-zero elements along with their coordinates to the user. An
        example is shown below.

        To read values from a tensor they must be specified using one call to ``__getitem__``. As a result, attempting
        to read an order 2 tensor ``A`` using the syntax ``A[0][0]`` is illegal in taco. The correct way to read A would
        be to write ``A[0, 0]``. This holds in general tensors of all dimensions.

        Inserting is done by an explicit :func:`tensor.insert` method instead of ``__setitem__``. This is because the
        current insert semantics actually increment the value in the current location.


        Methods
        --------
        transpose
        pack
        compile
        assemble
        compute
        evaluate
        to_dense
        to_array
        toarray
        to_sp_csr
        to_sp_csc
        copy
        insert
        remove_explicit_zeros

        Attributes
        -----------
        order
        name
        shape
        format
        dtype
        T

        Examples
        ------------
        Create a scalar tensor with the value 42.

        >>> import pytaco as pt
        >>> t = pt.tensor(42)

        Create an order 2 tensor, insert values then iterate over the non-zeros

        >>> mat = pt.tensor([2, 2], pt.csr)
        >>> mat.insert([1,1], 10)
        >>> mat.insert([0,0], 100)
        >>> for coord, value in mat:
        ...     print(coord, value)
        [0, 0] 100.0
        [1, 1] 10.0


    """

    def __init__(self, arg1=None, fmt=_cm.compressed, dtype=_cm.float32, name=None):

        if name is None:
            name = _cm.unique_name('A')

        if isinstance(arg1, int) or isinstance(arg1, float) or isinstance(arg1, bool) or not arg1:
            init_func = _dtype_to_tensor.get(dtype)
            if init_func is None:
                raise ValueError(_dtype_error)
            self._tensor = init_func(name)

            if arg1 is not None:
                self._tensor.insert([], arg1 if arg1 else 0)
                self._tensor.pack()

        elif isinstance(arg1, tuple) or isinstance(arg1, list):
            shape = arg1
            init_func = _dtype_to_tensor.get(dtype)
            if init_func is None:
                raise ValueError(_dtype_error)
            self._tensor = init_func(name, shape, fmt)
        else:
            raise ValueError("Invalid argument for first argument. Must be a tuple or list if a shape or a single value"
                             "if initializing a scalar.")

    @classmethod
    def _fromCppTensor(cls, cppTensor):
        pytensor = cls()
        pytensor._tensor = cppTensor
        return pytensor

    @classmethod
    def _from_x(cls, x, dtype):
        init_func = _dtype_to_tensor.get(dtype)
        if init_func is None:
            raise ValueError(_dtype_error)
        return cls._fromCppTensor(init_func(x))

    @classmethod
    def from_tensor_base(cls, tensor_base):
        return cls._from_x(tensor_base, tensor_base.dtype())

    @property
    def order(self):
        """
            Returns an int which is the order (number of dimensions) of the tensor.
        """
        return self._tensor.order()

    @property
    def name(self):
        """
            Returns a string representing the name of the tensor.
        """
        return self._tensor.get_name()

    @name.setter
    def name(self, name):
        self._tensor.set_name(name)

    @property
    def shape(self):
        """
            Returns the shape of the tensor. Shape[i] corresponds to the size of dimension i.
        """
        return self._tensor.get_dimensions()

    @property
    def dtype(self):
        """
            Returns the :class:`dtype` used to store the tensor.
        """
        return self._tensor.dtype()

    @property
    def format(self):
        """
            Returns the :class:`format` used to store the tensor.
        """
        return self._tensor.format()

    @property
    def T(self):
        """
            This is equivalent to :func:`~pytaco.tensor.transpose` with arguments ``shape[::-1]``.
        """
        if self.order < 2:
            return self
        new_ordering = list(range(self.order))[::-1]
        return self.transpose(new_ordering)

    def transpose(self, new_ordering, new_format=None, name=None):
        """
            Transposes a tensor.

            Returns a new tensor with the shape reordered by the new_ordering list passed in.

            Parameters
            ------------
            new_ordering: iter
                The ordering for the new tensor shape. The shape of the new_tensor is given by self.shape permuted
                by new_ordering.

            new_format: :class:`format`, optional
                The format of the new tensor. This follows the same rules for tensor initialization. If the format is
                not specified the tensor will have the same format as this tensor.

            name: str, optional
                The name of the output tensor. The same as in the tensor constructor.

            Notes
            ------
            This function always returns a new tensor regardless of if it is dense.

            Returns
            ---------
            t_tensor: tensor
                A tensor whose dimensions have been transposed by new_ordering with :class:`format` new_format if
                specified and name name if specified.

        """
        if name is None:
            name = _cm.unique_name('A')

        if new_format is None:
            new_format = self.format

        new_t = self._tensor.transpose(new_ordering, new_format, name)

        return tensor._fromCppTensor(new_t)

    def pack(self):
        """
            Packs a tensor.

            When performing inserts, taco will place new values in a buffer to avoid recreating sparse structures on
            every insert. As a result, data is not immediately updated to the tensor. Taco will automatically do this
            when needed (for example before printing, computing, writing to a file, iterating etc..) but we expose this
            mechanism to allow the user to explicitly pack a tensor themselves.

            It is advised to explicitly pack tensors before taking timing measurements since this is not considered to
            be part of the kernel computation.

            Examples
            -----------

            >>> import pytaco as pt
            >>> a = pt.tensor([2, 2])
            >>> a.insert([0, 0], 10) # 10 is in a buffer
            >>> a.pack() # 10 is now in matrix structure.

        """
        self._tensor.pack()

    def compile(self):
        """
            Compiles current expression.

            :class:`index_expression` are assigned to taco tensors. These expressions eventually need to get compiled
            by taco. The compile directive allows users to explicitly compile am efficient C kernel to perform the
            computation described by the :class:`index_expression`. Again, this is done implicitly by taco when needed
            but users will need to use this manually to obtain accurate timing measurements.
        """
        self._tensor.compile()

    def assemble(self):
        """
            Assemble the indices and values in the specified sparse structures.
        """
        self._tensor.assemble()

    def evaluate(self):
        """
            Compile, assemble and compute as needed.
        """
        self._tensor.evaluate()

    def compute(self):
        """
            Compute the given expression and put the values in the tensor storage.
        """
        self._tensor.compute()

    def __iter__(self):
        return iter(self._tensor)

    def __getitem__(self, index):
        return self._tensor[index]

    def __setitem__(self, key, value):
        self._tensor[key] = value

    def __repr__(self):
        return self._tensor.__repr__()

    def __add__(self, other):
        return tensor_add(self, other, default_mode)

    def __radd__(self, other):
        return tensor_add(other, self, default_mode)

    def __sub__(self, other):
        return tensor_sub(self, other, default_mode)

    def __rsub__(self, other):
        return tensor_sub(other, self, default_mode)

    def __mul__(self, other):
        return tensor_mul(self, other, default_mode)

    def __rmul__(self, other):
        return tensor_mul(other, self, default_mode)

    def __truediv__(self, other):
        return tensor_div(self, other, default_mode)

    def __rtruediv__(self, other):
        return tensor_div(other, self, default_mode)

    def __floordiv__(self, other):
        return tensor_floordiv(self, other, default_mode)

    def __rfloordiv__(self, other):
        return tensor_floordiv(other, self, default_mode)

    def __ge__(self, other):
        return tensor_ge(self, other, default_mode)

    def __gt__(self, other):
        return tensor_gt(self, other, default_mode)

    def __le__(self, other):
        return tensor_le(self, other, default_mode)

    def __lt__(self, other):
        return tensor_lt(self, other, default_mode)

    def __ne__(self, other):
        return tensor_ne(self, other, default_mode)

    def __eq__(self, other):
        return tensor_eq(self, other, default_mode)

    def __pow__(self, power, modulo=None):
        return tensor_pow(self, power, default_mode)

    def __abs__(self):
        return tensor_abs(self, self.format)

    def __neg__(self):
        return tensor_neg(self, self.format)

    def __array__(self):
        if not _cm.is_dense(self.format):
            raise ValueError("Cannot export a compressed tensor. Make sure all dimensions are dense "
                             "using to_dense() before attempting this conversion.")
        a = np.array(self._tensor, copy=False)
        a.setflags(write=False)  # forbid user from changing array via numpy if they request a copy.
        return a

    def to_dense(self):
        """
            Converts a tensor of any format to a dense tensor.
        """
        new_t = tensor(self.shape, _cm.dense, dtype=self.dtype)
        idx_vars = _cm.get_index_vars(self.order)
        new_t[idx_vars] = self[idx_vars]
        return new_t

    def to_array(self):
        """
            Same as :func:`to_array`.
        """
        return to_array(self)

    def toarray(self):
        """
            Alias for :func:`to_array` for compatibility with scipy.
        """
        return self.to_array()

    def to_sp_csr(self):
        """
            Same as :func:`to_sp_csr`.
        """
        return to_sp_csr(self)

    def to_sp_csc(self):
        """
            Same as :func:`to_sp_csc`.
        """
        return to_sp_csc(self)

    def copy(self):
        """
            Returns a deep copy of a tensor.
        """
        new_t = tensor(self.shape, self.format, dtype=self.dtype)
        idx_vars = _cm.get_index_vars(self.order)
        new_t[idx_vars] = self[idx_vars]
        return new_t

    def insert(self, coords, val):
        """
            Increments the value at a given coordinate (:func:`insert`).

            Parameters
            -----------
            coords: iter of ints
                The coordinate of the tensor we want to assign to val

            val: dtype
                The value to assign to the given coordinate

            Warnings
            ----------
            This function INCREMENTS the current value at coords.

            Examples
            ----------
            >>> import pytaco as pt
            >>> t = pt.tensor([2, 2])
            >>> t.insert([0, 0], 10)
            >>> t[0, 0]
            10.0
            >>> t.insert([0, 0], 100)
            >>> t[0, 0]
            110.0

            Notes
            ------
            Setitem was purposely not overridden to not confuse users with the semantics. This function will increment
            the value at a given coordinate. We anticipate that data will be read into taco using the variety of
            functions provided in :ref:`io`.

            This will be changed in a future release.
        """
        self._tensor.insert(coords, val)

    def remove_explicit_zeros(self, new_fmt=None, new_dtype=None):
        """
            Same as :func:`remove_explicit_zeros`.
        """
        return remove_explicit_zeros(self._tensor, new_fmt, new_dtype)


def remove_explicit_zeros(t, new_fmt=None, new_dtype=None):
    """
        Creates a tensor without explicit zeros.

        Parameters
        ------------
        t: tensor, array_like
            The object to convert to a tensor without explicit zeros.

        new_fmt: :class:`format`, optional
            The format to use for the storage of the new tensor. If not specified, the format of t is used.

        new_dtype: :class:`dtype`, optional
            The dtype to use for the new tensor. If not specified, the dtype of t is used.

        Notes
        -------
        If new_fmt is dense, then this function is forced to keep explicit zeros in the tensor in order for the data
        layout to be correct.

        Returns
        ---------
        new_t: tensor
            A new tensor with all of its explicit zeros removed if new_fmt is not dense.



    """
    t = as_tensor(t, copy=False)

    if new_dtype is None:
      new_dtype = t.dtype
    if new_dtype != t.dtype:
      t = as_type(t, new_dtype)

    new_fmt = t.format if new_fmt is None else new_fmt

    return tensor._fromCppTensor(t._tensor.remove_explicit_zeros(new_fmt))


def _from_matrix(inp_mat, copy, csr):
    matrix = inp_mat
    if not inp_mat.has_sorted_indices:
        matrix = inp_mat.sorted_indices()

    indptr, indices, data = matrix.indptr, matrix.indices, matrix.data
    shape = matrix.shape
    return tensor._fromCppTensor(_cm.fromSpMatrix(indptr, indices, data, shape, copy, csr))


def from_sp_csr(matrix, copy=True):
    """
    Convert a sparse scipy matrix to a CSR taco tensor.

    Initializes a taco tensor from a scipy.sparse.csr_matrix object. This function copies the data by default.

    Parameters
    -----------
    matrix: scipy.sparse.csr_matrix
        A sparse scipy matrix to use to initialize the tensor.

    copy: boolean, optional
        If true, taco copies the data from scipy and stores it. Otherwise, taco points to the same data as scipy.

    Notes
    --------
    The copy flag is ignored if the GPU backend is enabled.
    (This restriction will be lifted in future versions of taco.)

    Returns
    --------
    t: tensor
        A taco tensor pointing to the same underlying data as the scipy matrix if copy was set to False. Otherwise,
        returns a taco tensor containing data copied from the scipy matrix.
    """
    return _from_matrix(matrix, copy, True)  # true if csr false otherwise


def from_sp_csc(matrix, copy=True):
    """
    Convert a sparse scipy matrix to a CSC taco tensor.

    Initializes a taco tensor from a scipy.sparse.csc_matrix object. This function copies the data by default.

    Parameters
    -----------
    matrix: scipy.sparse.csc_matrix
        A sparse scipy matrix to use to initialize the tensor.

    copy: boolean, optional
        If true, taco copies the data from scipy and stores it. Otherwise, taco points to the same data as scipy.

    Notes
    --------
    The copy flag is ignored if the GPU backend is enabled.
    (This restriction will be lifted in future versions of taco.)

    Returns
    --------
    t: tensor
        A taco tensor pointing to the same underlying data as the scipy matrix if copy was set to False. Otherwise,
        returns a taco tensor containing data copied from the scipy matrix.
    """
    return _from_matrix(matrix, copy, False)


def from_array(array, copy=True):

    """Convert a numpy array to a tensor.

    Initializes a taco tensor from a numpy array and copies the array by default. This always creates a dense
    tensor.

    Parameters
    ------------
    array: numpy.array
        A numpy array to convert to a taco tensor

    copy: boolean, optional
        If true, taco copies the data from numpy and stores its own copy. If false, taco points to the same
        underlying data as the numpy array.

    Warnings
    ---------
    Taco's changes to tensors may NOT be visible to numpy since taco places inserts in buffers may copy tensor data
    after inserting. See notes for details.

    Notes
    --------
    The copy flag is ignored if the input array is not C contiguous or F contiguous (so for most transposed views).
    If taco detects an array that is not contiguous, it will always copy the numpy array into a C contiguous format.
    Additionally, if the GPU backend is enabled, taco will always copy the numpy array to CUDA unified memory.
    These restriction will be lifted in future versions of taco.

    Taco is mainly intended to operate on sparse tensors. As a result, it buffers inserts since inserting into sparse
    structures is very costly. This means that when the full tensor structure is needed, taco will copy the tensor to
    another location and insert the new values as needed. This saves a lot of time when dealing with sparse structures
    but is not needed for dense tensors (like numpy arrays). Currently, taco does this copy for dense and sparse tensors.
    As a result, after inserting into a taco tensor numpy will not see the changes since taco will not be writing to
    the same memory location that numpy is referencing.


    See also
    ----------
    :func:`from_sp_csc`, :func:`from_sp_csr`

    Examples
    ----------
    If we choose not to copy, modifying the tensor also modifies the numpy array and vice-versa. An example of this is
    shown:

    .. doctest::

        >>> import numpy as np
        >>> import pytaco as pt
        >>> arr = np.array([0, 1, 2, 3]) # Note that this is contiguous so copy possible
        >>> t = pt.from_array(arr, copy=False)
        >>> arr[0] = 23
        >>> t[0]
        23



    Returns
    --------
    t: tensor
        A taco tensor pointing to the same underlying data as the numpy array if copy was set to False. Otherwise,
        returns a taco tensor containing data copied from the numpy array.
    """


    # For some reason disabling conversion in pybind11 still copies C and F style arrays unnecessarily.
    # Disabling the force convert parameter also seems to not work. This explicity calls the different functions
    # to get this working for now
    col_major = array.flags["F_CONTIGUOUS"]

    # The array copying is done implicit by pybind if necessary. If arrays are not contiguous, they will be copied
    # to contiguous c_style or f_style memory layouts before being consumed by the fromNp* function
    t = _cm.fromNpF(array, copy) if col_major else _cm.fromNpC(array, copy)
    return tensor._fromCppTensor(t)


def to_array(t):
    """
    Converts a taco tensor to a numpy array.

    This always copies the tensor. To avoid the copy for dense tensors, see the notes section.

    Parameters
    -----------
    t: tensor
        A taco tensor to convert to a numpy array.

    Notes
    -------
    Dense tensors export python's buffer interface. As a result, they can be converted to numpy arrays using
    ``np.array(tensor, copy=False)`` . Attempting to do this for sparse tensors throws an error. Note that as a result
    of exporting the buffer interface dense tensors can also be converted to eigen or any other library supporting this
    inferface.

    Also it is very important to note that if requesting a numpy view of data owned by taco, taco will mark the array as
    read only meaning the user cannot write to that data without using the taco reference. This is needed to avoid
    raising issues with taco's delayed execution mechanism.

    Examples
    ----------
    We first look at a simple use of to_array

    >>> import pytaco as pt
    >>> t = pt.tensor([2, 2], [pt.dense, pt.compressed])
    >>> t.insert([0, 0], 10)
    >>> t.to_array()[0, 0]
    10.0


    One could choose to use np.array if a copy is not needed


    >>> import pytaco as pt
    >>> import numpy as np
    >>> t = pt.tensor([2, 2], pt.dense)
    >>> t.insert([0, 0], 10)
    >>> a = np.array(t, copy=False)
    >>> a
    array([[10.,  0.],
           [ 0.,  0.]], dtype=float32)
    >>> t.insert([0, 0], 100) # Note that insert increments instead of setting!
    >>> t.to_array()[0, 0]
    110.0


    Returns
    ---------
    arr: numpy.array
        A numpy array containing a copy of the data in the tensor object t.

    """
    return np.array(t.to_dense(), copy=True)


def to_sp_csr(t):
    """

    Converts a taco tensor to a scipy csr_matrix.

    Takes a matrix from taco in any format and converts the matrix to a scipy sparse csr matrix. This method removes
    explicit zeros from the original taco tensor during the conversion.

    Parameters
    -----------
    t: tensor
        A taco tensor to convert to a scipy.csr_matrix array. The tensor must be of order 2 (i.e it must be a matrix).
        If the order of the tensor is not equal to 2, a value error is thrown.


    Notes
    -------
    The data and index values are always copied when making the scipy sparse array


    Returns
    ---------
    matrix: scipy.sparse.csr_matrix
        A matrix containing a copy of the data from the original order 2 tensor t.

    """
    arrs = _cm.to_sp_matrix(t._tensor, True)
    return csr_matrix((arrs[2], arrs[1], arrs[0]), shape=t.shape)


def to_sp_csc(t):
    """

    Converts a taco tensor to a scipy csc_matrix.

    Takes a matrix from taco in any format and converts the matrix to a scipy sparse csc matrix. This method removes
    explicit zeros from the original taco tensor during the conversion.

    Parameters
    -----------
    t: tensor
        A taco tensor to convert to a scipy.csc_matrix array. The tensor must be of order 2 (i.e it must be a matrix).
        If the order of the tensor is not equal to 2, a value error is thrown.


    Notes
    -------
    The data and index values are always copied when making the scipy sparse array


    Returns
    ---------
    matrix: scipy.sparse.csc_matrix
        A matrix containing a copy of the data from the original order 2 tensor t.

"""
    arrs = _cm.to_sp_matrix(t._tensor, False)
    return csc_matrix((arrs[2], arrs[1], arrs[0]), shape=t.shape)


def as_tensor(obj, copy=True):
    """
        Converts array_like or scipy csr and csr to tensors.

        Converts an array_like object (list of lists, etc..) or scipy csr and csc matrices to a taco tensor.

        Parameters
        ------------
        obj: array_like, scipy.sparse.csr_matrix, scipy.sparse.csc_matrix, tensor
            The object to convert to a taco tensor. If the object is a tensor, it will be copied depending on the copy
            flag.

        copy: boolean. optional
            If true, taco will attempt to take a reference to the input if possible. If false, taco will always
            copy the input.

        Notes
        ------
        This method internally uses :func:`from_array`, :func:`from_sp_csr` and :func:`from_sp_csc`. As a result the restrictions
        to those methods and their copy parameters apply here. For instance, non-contiguous arrays will always be copied
        regardless of the copy flag.

        Python objects will also always be copied since this internally uses np.array to create an array_like
        object.

        Returns
        ----------
        t: tensor
            A tensor initialized with the data from the object passed in.

    """

    if isinstance(obj, tensor):
        return obj.copy() if copy else obj

    if isinstance(obj, int) or isinstance(obj, float):
        return tensor(obj)

    if isinstance(obj, np.ndarray):
        return from_array(obj, copy)

    if isinstance(obj, csc_matrix):
        return from_sp_csc(obj, copy)

    if isinstance(obj, csr_matrix):
        return from_sp_csr(obj, copy)

    # Try converting object to numpy array. This will ignore the copy flag
    arr = np.array(obj)
    return from_array(arr, True)


def _is_broadcastable(shape1, shape2):
    for a, b in zip(shape1[::-1], shape2[::-1]):
        if a != b:  # for singleton dimension we would need && a != 1 and b != 1 but this isn't current supported
            return False
    return True


def _compute_elt_wise_out_shape(shape1, shape2):
    if not _is_broadcastable(shape1, shape2):
        raise ValueError("Shapes {} and {} cannot be broadcasted together".format(shape1, shape2))

    return shape1 if len(shape1) >= len(shape2) else shape2


def _get_indices_for_operands(result_indices, order1, order2):
    # This returns a tuple of the index variables that should be used from
    # result_indices to access shapeA and shapeB
    start_a = len(result_indices) - order1
    start_b = len(result_indices) - order2
    return result_indices[start_a:], result_indices[start_b:]


def _compute_bin_elt_wise_op(op, t1, t2, out_format, dtype=None):

    t1, t2 = as_tensor(t1, False), as_tensor(t2, False)
    out_dtype = _cm.max_type(t1.dtype, t2.dtype) if dtype is None else dtype
    out_shape = _compute_elt_wise_out_shape(t1.shape, t2.shape)

    if out_shape:
        result = tensor(out_shape, out_format, dtype=out_dtype)
        index_var_list = _cm.get_index_vars(len(out_shape))
        index_var1, index_var2 = _get_indices_for_operands(index_var_list, t1.order, t2.order)
        result[index_var_list] = op(t1[index_var1], t2[index_var2])
        return result
    else:
        result = tensor(dtype=out_dtype)
        result[None] = op(t1[None], t2[None])
        return result


def tensor_add(t1, t2, out_format, dtype=None):
    """
        Computes the element wise addition of two tensors.

        * If the two tensors are equal order, performs the operation element-wise

        * If the two tensors have order N and M and N > M, requires the last M dimensions of the tensor with
          order N be equal to the dimensions of the tensor with order M in order to broadcast.


        The ``__add__`` method in the tensor class is implemented using this method.

        Parameters
        -----------
        t1, t2: tensors, array_like
            tensors or array_like input operands.

        out_format: format, mode_format
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype, optional
            The datatype of the output tensor.

        Notes
        --------
        The inner dimensions of the input tensor is broadcasted along the dimensions of whichever tensor has a higher
        order.


        Returns
        ---------
        sum: tensor
           The element wise addition of the input tensors broadcasted as required.

    """
    return _compute_bin_elt_wise_op(operator.add, t1, t2, out_format, dtype)


def tensor_mul(t1, t2, out_format, dtype=None):
    """
        Computes the element wise multiplication of two tensors.

        * If the two tensors are equal order, performs the operation element-wise

        * If the two tensors have order N and M and N > M, requires the last M dimensions of the tensor with
          order N be equal to the dimensions of the tensor with order M in order to broadcast.


        The ``__mul__`` method in the tensor class is implemented using this method.

        Parameters
        -----------
        t1, t2: tensors, array_like
            tensors or array_like input operands.

        out_format: format, mode_format
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype, optional
            The datatype of the output tensor.

        Notes
        --------
        The inner dimensions of the input tensor is broadcasted along the dimensions of whichever tensor has a higher
        order.


        Returns
        ---------
        product: tensor
           The element wise product of the input tensors broadcasted as required.

    """
    return _compute_bin_elt_wise_op(operator.mul, t1, t2, out_format, dtype)


def tensor_sub(t1, t2, out_format, dtype=None):
    """
        Computes the element wise subtraction of two tensors.

        * If the two tensors are equal order, performs the operation element-wise

        * If the two tensors have order N and M and N > M, requires the last M dimensions of the tensor with
          order N be equal to the dimensions of the tensor with order M in order to broadcast.


        The ``__sub__`` method in the tensor class is implemented using this method.

        Parameters
        -----------
        t1, t2: tensors, array_like
            tensors or array_like input operands.

        out_format: format, mode_format
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype, optional
            The datatype of the output tensor.

        Notes
        --------
        The inner dimensions of the input tensor is broadcasted along the dimensions of whichever tensor has a higher
        order.


        Returns
        ---------
        difference: tensor
           The element wise difference of the input tensors broadcasted as required.
    """
    return _compute_bin_elt_wise_op(operator.sub, t1, t2, out_format, dtype)


def tensor_div(t1, t2, out_format, dtype=None):
    """
        Computes the element wise division of two tensors.

        * If the two tensors are equal order, performs the operation element-wise

        * If the two tensors have order N and M and N > M, requires the last M dimensions of the tensor with
          order N be equal to the dimensions of the tensor with order M in order to broadcast.

        The ``__truediv__`` method in the tensor class is implemented using this method.

        Warnings
        ----------
        This performs division on all elements in a tensor including the implicit zeros. Division by 0 produces
        undefined behavior.


        Parameters
        -----------
        t1, t2: tensors, array_like
            tensors or array_like input operands.

        out_format: format, mode_format
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype, optional
            The datatype of the output tensor.

        Notes
        --------
        The inner dimensions of the input tensor is broadcasted along the dimensions of whichever tensor has a higher
        order.


        Returns
        ---------
        quotient: tensor
           The element wise division of the input tensors broadcasted as required.
    """
    return _compute_bin_elt_wise_op(operator.truediv, t1, t2, out_format, dtype)


def tensor_floordiv(t1, t2, out_format, dtype=_cm.int64):
    """
        Computes the element wise floor division of two tensors.

        * If the two tensors are equal order, performs the operation element-wise

        * If the two tensors have order N and M and N > M, requires the last M dimensions of the tensor with
          order N be equal to the dimensions of the tensor with order M in order to broadcast.

        The ``__floordiv__`` method in the tensor class is implemented using this method.

        Warnings
        ----------
        This performs division on all elements in a tensor including the implicit zeros. Division by 0 produces
        undefined behavior.

        Parameters
        -----------
        t1, t2: tensors, array_like
            tensors or array_like input operands.

        out_format: format, mode_format
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype, optional
            The datatype of the output tensor.

        Notes
        --------
        The inner dimensions of the input tensor is broadcasted along the dimensions of whichever tensor has a higher
        order.


        Returns
        ---------
        quotient: tensor
           The element wise floor division of the input tensors broadcasted as required. The result is always an
           integer tensor.
    """
    if not dtype.is_int() or not dtype.is_uint():
        raise ValueError("Floor divide must have int data type as output")
    return _compute_bin_elt_wise_op(operator.floordiv, t1, t2, out_format, dtype)


def tensor_gt(t1, t2, out_format, dtype=_cm.bool):
    """
        Computes t1 > t2 element-wise.

        * If the two tensors are equal order, performs the operation element-wise

        * If the two tensors have order N and M and N > M, requires the last M dimensions of the tensor with
          order N be equal to the dimensions of the tensor with order M in order to broadcast.

        The tensor class implements ``__gt__`` using this method.

        Parameters
        -----------
        t1, t2: tensors, array_like
            tensors or array_like input operands.

        out_format: format, mode_format
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype, optional
            The datatype of the output tensor.

        Notes
        --------
        The inner dimensions of the input tensor is broadcasted along the dimensions of whichever tensor has a higher
        order.

        Returns
        ---------
        gt: tensor
           A true value everywhere t1 > t2 and a false value in all other locations.
    """
    return _compute_bin_elt_wise_op(operator.gt, t1, t2, out_format, dtype)


def tensor_ge(t1, t2, out_format, dtype=_cm.bool):
    """
        Computes t1 >= t2 element-wise.

        * If the two tensors are equal order, performs the operation element-wise

        * If the two tensors have order N and M and N > M, requires the last M dimensions of the tensor with
          order N be equal to the dimensions of the tensor with order M in order to broadcast.

        The tensor class implements ``__ge__`` using this method.

        Parameters
        -----------
        t1, t2: tensors, array_like
            tensors or array_like input operands.

        out_format: format, mode_format
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype, optional
            The datatype of the output tensor.

        Notes
        --------
        The inner dimensions of the input tensor is broadcasted along the dimensions of whichever tensor has a higher
        order.

        Returns
        ---------
        ge: tensor
           A true value everywhere t1 >= t2 and a false value in all other locations.
    """
    return _compute_bin_elt_wise_op(operator.ge, t1, t2, out_format, dtype)


def tensor_lt(t1, t2, out_format, dtype=_cm.bool):
    """
        Computes t1 < t2 element-wise.

        * If the two tensors are equal order, performs the operation element-wise

        * If the two tensors have order N and M and N > M, requires the last M dimensions of the tensor with
          order N be equal to the dimensions of the tensor with order M in order to broadcast.

        The tensor class implements ``__lt__`` using this method.

        Parameters
        -----------
        t1, t2: tensors, array_like
            tensors or array_like input operands.

        out_format: format, mode_format
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype, optional
            The datatype of the output tensor.

        Notes
        --------
        The inner dimensions of the input tensor is broadcasted along the dimensions of whichever tensor has a higher
        order.

        Returns
        ---------
        lt: tensor
           A true value everywhere t1 < t2 and a false value in all other locations.
    """
    return _compute_bin_elt_wise_op(operator.lt, t1, t2, out_format, dtype)


def tensor_le(t1, t2, out_format, dtype=_cm.bool):
    """
        Computes t1 <= t2 element-wise.

        * If the two tensors are equal order, performs the operation element-wise

        * If the two tensors have order N and M and N > M, requires the last M dimensions of the tensor with
          order N be equal to the dimensions of the tensor with order M in order to broadcast.

        The tensor class implements ``__le__`` using this method.

        Parameters
        -----------
        t1, t2: tensors, array_like
            tensors or array_like input operands.

        out_format: format, mode_format
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype, optional
            The datatype of the output tensor.

        Notes
        --------
        The inner dimensions of the input tensor is broadcasted along the dimensions of whichever tensor has a higher
        order.

        Returns
        ---------
        leq: tensor
           A true value everywhere t1 <= t2 and a false value in all other locations.
    """
    return _compute_bin_elt_wise_op(operator.le, t1, t2, out_format, dtype)


def tensor_ne(t1, t2, out_format, dtype=_cm.bool):
    """
        Computes t1 != t2 element-wise.

        * If the two tensors are equal order, performs the operation element-wise

        * If the two tensors have order N and M and N > M, requires the last M dimensions of the tensor with
          order N be equal to the dimensions of the tensor with order M in order to broadcast.

        The tensor class implements ``__ne__`` using this method.

        Parameters
        -----------
        t1, t2: tensors, array_like
            tensors or array_like input operands.

        out_format: format, mode_format
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype, optional
            The datatype of the output tensor.

        Notes
        --------
        The inner dimensions of the input tensor is broadcasted along the dimensions of whichever tensor has a higher
        order.

        Returns
        ---------
        neq: tensor
           A true value everywhere t1 != t2 and a false value in all other locations.
    """
    return _compute_bin_elt_wise_op(operator.ne, t1, t2, out_format, dtype)


def tensor_eq(t1, t2, out_format, dtype=_cm.bool):
    """
        Computes t1 == t2 element-wise.

        * If the two tensors are equal order, performs the operation element-wise

        * If the two tensors have order N and M and N > M, requires the last M dimensions of the tensor with
          order N be equal to the dimensions of the tensor with order M in order to broadcast.

        The tensor class implements ``__eq__`` using this method.

        Parameters
        -----------
        t1, t2: tensors, array_like
            tensors or array_like input operands.

        out_format: format, mode_format
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype, optional
            The datatype of the output tensor.

        Notes
        --------
        The inner dimensions of the input tensor is broadcasted along the dimensions of whichever tensor has a higher
        order.

        Returns
        ---------
        eq: tensor
           A true value everywhere t1 == t2 and a false value in all other locations.
    """
    return _compute_bin_elt_wise_op(operator.eq, t1, t2, out_format, dtype)


def tensor_pow(t1, t2, out_format, dtype=None):
    """
        Computes the t1**t2 element-wise.

        * If the two tensors are equal order, performs the operation element-wise

        * If the two tensors have order N and M and N > M, requires the last M dimensions of the tensor with
          order N be equal to the dimensions of the tensor with order M in order to broadcast.

        The tensor class implements ``__pow__`` using this method.

        Parameters
        -----------
        t1, t2: tensors, array_like
            tensors or array_like input operands.

        out_format: format, mode_format
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype, optional
            The datatype of the output tensor.

        Notes
        --------
        The inner dimensions of the input tensor is broadcasted along the dimensions of whichever tensor has a higher
        order.

        Returns
        ---------
        power: tensor
           The element wise power of the input tensors broadcasted as required.
    """
    return _compute_bin_elt_wise_op(operator.pow, t1, t2, out_format, dtype)


def tensor_max(t1, t2, out_format, dtype=None):
    """
        Computes the element wise maximum of two tensors.

        * If the two tensors are equal order, performs the operation element-wise

        * If the two tensors have order N and M and N > M, requires the last M dimensions of the tensor with
          order N be equal to the dimensions of the tensor with order M in order to broadcast.

        Parameters
        -----------
        t1, t2: tensors, array_like
            tensors or array_like input operands.

        out_format: format, mode_format
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype, optional
            The datatype of the output tensor.

        Notes
        --------
        The inner dimensions of the input tensor is broadcasted along the dimensions of whichever tensor has a higher
        order.

        Returns
        ---------
        max: tensor
           The element wise maximum of the input tensors broadcasted as required.
    """

    return _compute_bin_elt_wise_op(_cm.max, t1, t2, out_format, dtype)


def tensor_min(t1, t2, out_format, dtype=None):
    """
        Computes the element wise minimum of two tensors.

        * If the two tensors are equal order, performs the operation element-wise

        * If the two tensors have order N and M and N > M, requires the last M dimensions of the tensor with
          order N be equal to the dimensions of the tensor with order M in order to broadcast.

        Parameters
        -----------
        t1, t2: tensors, array_like
            tensors or array_like input operands.

        out_format: format, mode_format
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype, optional
            The datatype of the output tensor.

        Notes
        --------
        The inner dimensions of the input tensor is broadcasted along the dimensions of whichever tensor has a higher
        order.


        Returns
        ---------
        min: tensor
           The element wise minimum of the input tensors broadcasted as required.
    """
    return _compute_bin_elt_wise_op(_cm.min, t1, t2, out_format, dtype)


def tensor_heaviside(t1, t2, out_format, dtype=None):
    """
        Computes the element wise heaviside step function of two tensors.

        This is defined as:

        * 0 if t1 < 0

        * t2 if t1 == 0

        * 1 if t1 > 0

        Also note:

        * If the two tensors are equal order, performs the operation element-wise

        * If the two tensors have order N and M and N > M, requires the last M dimensions of the tensor with
          order N be equal to the dimensions of the tensor with order M in order to broadcast.

        Parameters
        -----------
        t1, t2: tensors, array_like
            tensors or array_like input operands.

        out_format: format, mode_format
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype, optional
            The datatype of the output tensor.

        Notes
        --------
        The inner dimensions of the input tensor is broadcasted along the dimensions of whichever tensor has a higher
        order.

        Examples
        ---------
        >>> import pytaco as pt
        >>> pt.tensor_heaviside([0, 0.5, 6], [0.5, 10, 10], pt.dense).to_array()
        array([0.5, 1. , 1. ])

        Returns
        ---------
        heaviside: tensor
           The element wise heaviside step function of the input tensors broadcasted as required.
    """

    return _compute_bin_elt_wise_op(_cm.heaviside, t1, t2, out_format, dtype)


def _compute_unary_elt_eise_op(op, t1, out_format, dtype=None):

    t1 = as_tensor(t1, False)
    out_dtype = t1.dtype if dtype is None else dtype
    out_shape = t1.shape

    if out_shape:
        result = tensor(out_shape, out_format, dtype=out_dtype)
        index_var_list = _cm.get_index_vars(t1.order)
        result[index_var_list] = op(t1[index_var_list])
        return result
    else:
        result = tensor(dtype=out_dtype)
        result[None] = op(t1[None])
        return result


def tensor_logical_not(t1, out_format, dtype=None):
    """
        Calculates the element wise logical not of the input tensor.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Returns
        --------
        not_t1: tensor
            The element wise logical not of the input tensor.

    """
    return _compute_unary_elt_eise_op(_cm.logical_not, t1, out_format, dtype)

def tensor_neg(t1, out_format, dtype=None):
    """
        Negates every value in the tensor.

        The tensor class implements ``__neg__`` using this method.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.


        Examples
        ----------
        >>> import pytaco as pt
        >>> pt.tensor_neg([1, -2, 0], out_format=pt.dense).toarray()
        array([-1, 2, 0], dtype=int64)

        Returns
        --------
        neg: tensor
            The element wise negation of the input tensor.

    """
    return _compute_unary_elt_eise_op(_cm.neg, t1, out_format, dtype)

def tensor_abs(t1, out_format, dtype=None):
    """
        Calculates the element wise absolute value on the input tensor.

        The tensor class implements ``__abs__`` using this method.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.


        Examples
        ----------
        >>> import pytaco as pt
        >>> pt.tensor_abs([1, -2], out_format=pt.dense).toarray()
        array([1, 2], dtype=int64)

        Returns
        --------
        abs: tensor
            The element wise absolute value of the input tensor.

    """
    return _compute_unary_elt_eise_op(_cm.abs, t1, out_format, dtype)


def tensor_square(t1, out_format, dtype=None):
    """
        Squares each input in the tensor.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Examples
        ----------
        >>> import pytaco as pt
        >>> pt.tensor_square([1, -2, 7], out_format=pt.dense).toarray()
        array([ 1,  4, 49], dtype=int64)

        Returns
        --------
        square: tensor
            The element wise square of the input tensor.

    """
    return _compute_unary_elt_eise_op(_cm.square, t1, out_format, dtype)


def tensor_cube(t1, out_format, dtype=None):
    """
        Cubes each input in the tensor.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Examples
        ----------
        >>> import pytaco as pt
        >>> pt.tensor_cube([1, -2, 7], out_format=pt.dense).toarray()
        array([  1,  -8, 343], dtype=int64)

        Returns
        --------
        cube: tensor
            The element wise cube of the input tensor.

    """
    return _compute_unary_elt_eise_op(_cm.cube, t1, out_format, dtype)


def tensor_sqrt(t1, out_format, dtype=None):
    """
        Takes the square root of each input in the tensor.

        Values must be >= 0.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Examples
        ----------
        >>> import pytaco as pt
        >>> pt.tensor_sqrt([4, 16, 9], out_format=pt.dense, dtype=pt.float32).toarray()
        array([2., 4., 3.], dtype=float32)

        Returns
        --------
        sqrt: tensor
            The element wise sqrt of the input tensor.

    """
    t1 = as_tensor(t1, copy=False)
    cast_val = _cm.max_type(_cm.float32, t1.dtype)
    f = lambda x: _cm.sqrt(_cm.cast(x, cast_val))

    return _compute_unary_elt_eise_op(f, t1, out_format, dtype)


def tensor_cube_root(t1, out_format, dtype=None):
    """
        Takes the cube root of each input in the tensor.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Examples
        ----------
        >>> import pytaco as pt
        >>> pt.tensor_cube_root([4, -16, 9], out_format=pt.dense, dtype=pt.float32).toarray()
        array([ 1.587401 , -2.5198421,  2.0800838], dtype=float32)

        Returns
        --------
        cbrt: tensor
            The element wise cbrt of the input tensor.

    """

    t1 = as_tensor(t1, copy=False)
    cast_val = _cm.max_type(_cm.float32, t1.dtype)
    f = lambda x: _cm.cube_root(_cm.cast(x, cast_val))
    return _compute_unary_elt_eise_op(f, t1, out_format, dtype)


def tensor_exp(t1, out_format, dtype=None):
    """
        Takes the exp of each input in the tensor.

        Note that this is applied to all elements in the tensor not just non-zeros. It is highly advised to store
        the output of this function in a dense format.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Examples
        ----------
        Note in the example below, the function is performed on the implicit 0s even after they are removed.

        >>> import pytaco as pt
        >>> t = pt.remove_explicit_zeros([4, -16, 0, 0], pt.compressed)
        >>> pt.tensor_exp(t, out_format=pt.compressed, dtype=pt.int32).to_array()
        array([54,  0,  1,  1], dtype=int32)

        Returns
        --------
        exp: tensor
            The element wise exp of the input tensor.

    """

    t1 = as_tensor(t1, copy=False)
    cast_val = _cm.max_type(_cm.float32, t1.dtype)
    f = lambda x: _cm.exp(_cm.cast(x, cast_val))
    return _compute_unary_elt_eise_op(f, t1, out_format, dtype)


def tensor_log(t1, out_format, dtype=None):
    """
        Takes the natural log of each input in the tensor.

        Note that this is applied to all elements in the tensor not just non-zeros.

        Warnings
        ---------
        The log of 0 is undefined and is performed on every element in the tensor regardless of sparsity.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Examples
        ----------
        >>> import pytaco as pt
        >>> pt.tensor_log([4, 16], out_format=pt.compressed, dtype=pt.float32).to_array()
        array([1.3862944, 2.7725887], dtype=float32)


        Returns
        --------
        natural log: tensor
            The element wise natural log of the input tensor.
    """

    t1 = as_tensor(t1, copy=False)
    cast_val = _cm.max_type(_cm.float32, t1.dtype)
    f = lambda x: _cm.log(_cm.cast(x, cast_val))
    return _compute_unary_elt_eise_op(f, t1, out_format, dtype)


def tensor_log10(t1, out_format, dtype=None):
    """
        Takes the log base 10 of each input in the tensor.

        Note that this is applied to all elements in the tensor not just non-zeros.

        Warnings
        ---------
        The log10 of 0 is undefined and is performed on every element in the tensor regardless of sparsity.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Examples
        ----------
        >>> import pytaco as pt
        >>> pt.tensor_log10([10, 100], out_format=pt.compressed, dtype=pt.float32).to_array()
        array([1., 2.], dtype=float32)


        Returns
        --------
        log10: tensor
            The element wise log10 of the input tensor.
    """

    t1 = as_tensor(t1, copy=False)
    cast_val = _cm.max_type(_cm.float32, t1.dtype)
    f = lambda x: _cm.log10(_cm.cast(x, cast_val))
    return _compute_unary_elt_eise_op(f, t1, out_format, dtype)


def tensor_sin(t1, out_format, dtype=None):
    """
        Takes the sin of each input in the tensor.

        This is applied to all elements in the tensor not just non-zeros.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Examples
        ----------
        >>> import pytaco as pt
        >>> pt.tensor_sin([1, 3.14], out_format=pt.compressed, dtype=pt.float32).to_array()
        array([0.84147096, 0.00159265], dtype=float32)


        Returns
        --------
        sin: tensor
            The element wise sin of the input tensor.
    """

    t1 = as_tensor(t1, copy=False)
    cast_val = _cm.max_type(_cm.float32, t1.dtype)
    f = lambda x: _cm.sin(_cm.cast(x, cast_val))
    return _compute_unary_elt_eise_op(f, t1, out_format, dtype)


def tensor_cos(t1, out_format, dtype=None):
    """
        Takes the cosine of each input in the tensor.

        This is applied to all elements in the tensor not just non-zeros.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Examples
        ----------
        >>> import pytaco as pt
        >>> pt.tensor_cos([1, 3.14], out_format=pt.compressed, dtype=pt.float32).to_array()
        array([ 0.5403023 , -0.99999875], dtype=float32)


        Returns
        --------
        cosine: tensor
            The element wise cosine of the input tensor.
    """

    t1 = as_tensor(t1, copy=False)
    cast_val = _cm.max_type(_cm.float32, t1.dtype)
    f = lambda x: _cm.cos(_cm.cast(x, cast_val))
    return _compute_unary_elt_eise_op(f, t1, out_format, dtype)


def tensor_tan(t1, out_format, dtype=None):
    """
        Takes the tan of each input in the tensor.

        This is applied to all elements in the tensor not just non-zeros.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Examples
        ----------
        >>> import pytaco as pt
        >>> pt.tensor_tan([1, 3.14], out_format=pt.compressed, dtype=pt.float32).to_array()
        array([ 1.5574077 , -0.00159265], dtype=float32)


        Returns
        --------
        tan: tensor
            The element wise tan of the input tensor.
    """
    t1 = as_tensor(t1, copy=False)
    cast_val = _cm.max_type(_cm.float32, t1.dtype)
    f = lambda x: _cm.tan(_cm.cast(x, cast_val))
    return _compute_unary_elt_eise_op(f, t1, out_format, dtype)


def tensor_asin(t1, out_format, dtype=None):
    """
        Takes the arcsine of each input in the tensor.

        This is applied to all elements in the tensor not just non-zeros.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Returns
        --------
        arcsine: tensor
            The element wise arc sine of the input tensor.
    """

    t1 = as_tensor(t1, copy=False)
    cast_val = _cm.max_type(_cm.float32, t1.dtype)
    f = lambda x: _cm.asin(_cm.cast(x, cast_val))
    return _compute_unary_elt_eise_op(f, t1, out_format, dtype)


def tensor_acos(t1, out_format, dtype=None):
    """
        Takes the arccosine of each input in the tensor.

        This is applied to all elements in the tensor not just non-zeros.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Returns
        --------
        arccosine: tensor
            The element wise arccosine of the input tensor.
    """
    t1 = as_tensor(t1, copy=False)
    cast_val = _cm.max_type(_cm.float32, t1.dtype)
    f = lambda x: _cm.acos(_cm.cast(x, cast_val))
    return _compute_unary_elt_eise_op(f, t1, out_format, dtype)


def tensor_atan(t1, out_format, dtype=None):
    """
        Takes the arctan of each input in the tensor.

        This is applied to all elements in the tensor not just non-zeros.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Returns
        --------
        arctan: tensor
            The element wise arctan of the input tensor.
    """

    t1 = as_tensor(t1, copy=False)
    cast_val = _cm.max_type(_cm.float32, t1.dtype)
    f = lambda x: _cm.atan(_cm.cast(x, cast_val))
    return _compute_unary_elt_eise_op(f, t1, out_format, dtype)


def tensor_atan2(t1, out_format, dtype=None):
    """
        Takes the `atan2 <https://en.wikipedia.org/wiki/Atan2>`_ of each input in the tensor.

        This is applied to all elements in the tensor not just non-zeros.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Returns
        --------
        arctan2: tensor
            The element wise arctan2 of the input tensor.
    """
    t1 = as_tensor(t1, copy=False)
    cast_val = _cm.max_type(_cm.float32, t1.dtype)
    f = lambda x: _cm.atan2(_cm.cast(x, cast_val))
    return _compute_unary_elt_eise_op(f, t1, out_format, dtype)


def tensor_sinh(t1, out_format, dtype=None):
    """
        Takes the hyperbolic sine of each input in the tensor.

        This is applied to all elements in the tensor not just non-zeros.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Returns
        --------
        hyperbolic_sine: tensor
            The element wise hyperbolic sine of the input tensor.
    """
    t1 = as_tensor(t1, copy=False)
    cast_val = _cm.max_type(_cm.float32, t1.dtype)
    f = lambda x: _cm.sinh(_cm.cast(x, cast_val))
    return _compute_unary_elt_eise_op(f, t1, out_format, dtype)


def tensor_cosh(t1, out_format, dtype=None):
    """
        Takes the hyperbolic cosine of each input in the tensor.

        This is applied to all elements in the tensor not just non-zeros.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Returns
        --------
        hyperbolic_cosine: tensor
            The element wise hyperbolic cosine of the input tensor.
    """
    t1 = as_tensor(t1, copy=False)
    cast_val = _cm.max_type(_cm.float32, t1.dtype)
    f = lambda x: _cm.cosh(_cm.cast(x, cast_val))
    return _compute_unary_elt_eise_op(f, t1, out_format, dtype)


def tensor_tanh(t1, out_format, dtype=None):
    """
        Takes the hyperbolic tangent of each input in the tensor.

        This is applied to all elements in the tensor not just non-zeros.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Returns
        --------
        hyperbolic_tangent: tensor
            The element wise hyperbolic tangent of the input tensor.
    """
    t1 = as_tensor(t1, copy=False)
    cast_val = _cm.max_type(_cm.float32, t1.dtype)
    f = lambda x: _cm.tanh(_cm.cast(x, cast_val))
    return _compute_unary_elt_eise_op(f, t1, out_format, dtype)


def tensor_asinh(t1, out_format, dtype=None):
    """
        Takes the inverse hyperbolic sine of each input in the tensor.

        This is applied to all elements in the tensor not just non-zeros.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Returns
        --------
        inverse_hyperbolic_sine: tensor
            The element wise inverse hyperbolic sine of the input tensor.
    """
    t1 = as_tensor(t1, copy=False)
    cast_val = _cm.max_type(_cm.float32, t1.dtype)
    f = lambda x: _cm.asinh(_cm.cast(x, cast_val))
    return _compute_unary_elt_eise_op(f, t1, out_format, dtype)


def tensor_acosh(t1, out_format, dtype=None):
    """
        Takes the inverse hyperbolic cosine of each input in the tensor.

        This is applied to all elements in the tensor not just non-zeros.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Returns
        --------
        inverse_hyperbolic_cosine: tensor
            The element wise inverse hyperbolic cosine of the input tensor.
    """
    t1 = as_tensor(t1, copy=False)
    cast_val = _cm.max_type(_cm.float32, t1.dtype)
    f = lambda x: _cm.acosh(_cm.cast(x, cast_val))
    return _compute_unary_elt_eise_op(f, t1, out_format, dtype)


def tensor_atanh(t1, out_format, dtype=None):
    """
        Takes the inverse hyperbolic tangent of each input in the tensor.

        This is applied to all elements in the tensor not just non-zeros.

        Parameters
        ------------
        t1: tensor, array_like
            input tensor or array_like object

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Returns
        --------
        inverse_hyperbolic_tangent: tensor
            The element wise inverse hyperbolic tangent of the input tensor.
    """
    t1 = as_tensor(t1, copy=False)
    cast_val = _cm.max_type(_cm.float32, t1.dtype)
    f = lambda x: _cm.atanh(_cm.cast(x, cast_val))
    return _compute_unary_elt_eise_op(f, t1, out_format, dtype)


def _remove_elts_at_index(inp, elts_to_remove):
    result = inp[:]
    for elt in elts_to_remove:
        if elt >= len(inp) or elt < 0:
            raise ValueError("Axis {} too large for tensor of order {}".format(elt, len(inp)))
        result[elt] = None

    return list(filter(lambda x: x is not None, result))


def _as_list(x):
    if type(x) is list:
        return x
    else:
        return [x]


def tensor_sum(t1, axis=None, out_format=default_mode, dtype=None):
    """
        Sums a tensor along a specified axis or axes of a tensor.

        Parameters
        ------------
        t1: tensor, array_like
            The elements to sum.

        axis: None, int or tuple of ints, optional
            Axis or axes along which the sum is performed. If the axis is a tuple of ints, the sum is performed on all
            of the specified axes.

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

        Examples
        ----------
        The sum of an empty element is 0.

        >>> import pytaco as pt
        >>> pt.tensor_sum([])[0]
        0.0

        >>> pt.tensor_sum([0.5, 1.5])[0]
        2.0
        >>> pt.tensor_sum([0.5, 0.7, 0.2, 1.5], dtype=pt.int32)[0]
        1
        >>> pt.tensor_sum([[0, 1], [0, 5]])[0]
        6
        >>> pt.tensor_sum([[0, 1], [0, 5]], axis=0).to_array()
        array([0, 6], dtype=int64)
        >>> pt.tensor_sum([[0, 1], [0, 5]], axis=1).to_array()
        array([1, 5], dtype=int64)

        Returns
        --------
        summed_axes: tensor
            A tensor with the same dimensions as t1 with all of the specified axes removed. If axis is None, a scalar
            tensor is returned.

    """
    t1 = as_tensor(t1, False)

    out_dtype = t1.dtype if dtype is None else dtype
    res_shape = [] if axis is None else _remove_elts_at_index(t1.shape, _as_list(axis))

    inp_index_vars = _cm.get_index_vars(t1.order)
    out_index_vars = [] if axis is None else _remove_elts_at_index(inp_index_vars, _as_list(axis))

    result_tensor = tensor(res_shape, fmt=out_format, dtype=out_dtype)
    result_tensor[out_index_vars] = t1[inp_index_vars]
    return result_tensor


def as_type(t, type):
    """
        Converts a tensor from one type to another.

        Parameters
        -----------
        t: tensor
            Tensor to convert

        type: :class:`dtype`
            The data type of the new tensor


        Examples
        ----------
        >>> import pytaco as pt
        >>> t = pt.tensor(100, dtype=pt.double)
        >>> pt.as_type(t, pt.int8).dtype
        pytaco.int8_t

        Returns
        ---------
        new_t: tensor
            A new tensor new_t with datatype type where all elements in t are cast to type.

    """

    vars = _cm.get_index_vars(t.order)
    new_t = tensor(t.shape, t.format, type)
    new_t[vars] = t[vars]
    return new_t


def _matrix_out_shape(shape1, shape2):
    if len(shape1) < 2 or len(shape2) < 2:
        raise ValueError("Invalid tensor order for matrix multiply. Must be at least order 2 but operand1 has "
                         "order {} while operand 2 has order {}.".format(len(shape1), len(shape2)))

    if shape1[-1] != shape2[-2]:
        raise ValueError("Input operand1 has value {} in dimension 0 while operand2 has value {} in dimension 1. "
                         "These two dimensions must be equal".format(shape1[-1], shape2[-2]))

    if not _is_broadcastable(shape1[:-2], shape2[:-2]):
        raise ValueError("Cannot broadcast outer tensor elements. All the leading dimensions must be equal.")

    result_shape = shape1[:-2] if len(shape1) >= len(shape2) else shape2[:-2]
    result_shape.extend([shape1[-2], shape2[-1]])
    return result_shape


def matmul(t1, t2, out_format=default_mode, dtype=None):
    """
        Matrix multiplication of two tensors.

        Parameters
        -----------
        t1, t2: tensors, array_like
            The tensors to multiply. Tensors must be at least 2-D.

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.


        Notes
        ------
        * When both arguments are 2-D conventional matrix multiply is performed.

        * If either argument is N-D N > 2, the tensor is treated as a stack of matrices and is broadcasted in this
          manner.

        Examples
        ---------
        Here we demonstrate broadcasting for a 3-D tensor. We use numpy arrays for demonstration due to easy data
        generation. However, we could have given sparse tensors of any format for taco to compute. Note that the
        choice of a tensor format has a big effect on the final performance. For instance it is favorable to multiply
        CSR matrices with CSC since dot products can be easily computed.

        >>> import pytaco as pt
        >>> import numpy as np
        >>> a = np.arange(2 * 2 * 4).reshape((2, 2, 4))
        >>> b = np.arange(2 * 2 * 4).reshape((2, 4, 2))
        >>> pt.matmul(a,b).shape
        [2, 2, 2]
        >>> pt.matmul(a, b)[0, 1, 1]
        98

        Returns
        --------
        res: tensor
            The matrix product of the input tensors.


    """
    t1, t2 = as_tensor(t1, False), as_tensor(t2, False)

    out_dtype = _cm.max_type(t1.dtype, t2.dtype) if dtype is None else dtype
    out_shape = _matrix_out_shape(t1.shape, t2.shape)
    result_tensor = tensor(out_shape, fmt=out_format, dtype=out_dtype)

    reduction_var = _cm.index_var()
    leading_vars = _cm.get_index_vars(max(t1.order, t2.order))
    t1_vars, t2_vars = leading_vars[-t1.order:], leading_vars[-t2.order:]
    t1_vars[-1] = reduction_var
    t2_vars[-2] = reduction_var
    result_tensor[leading_vars] = t1[t1_vars] * t2[t2_vars]
    return result_tensor


def inner(t1, t2, out_format=default_mode, dtype=None):
    """
        The inner product of two arrays.

        In general, this is a sum product over the last dimensions of the two inputs. If a or b is a scalar,
        element-wise multiplication is performed.


        Parameters
        -----------
        t1, t2: tensors, array_like
            The tensors to take the inner product of. If non-scalar, the size of their last dimensions must match.

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.


        Notes
        -------

        For 3-D tensors, this is equivalent to writing ``A[i, j] = B[i, j, k] * C[i, j, k]``. We can also explicitly
        write ``A[i, j] = sum(B[i, j, k] * C[i, j, k], k)`` assuming the tensors and index variables have been defined.

        Examples
        ----------
        >>> import pytaco as pt
        >>> import numpy as np
        >>> a = np.arange(24).reshape((2,3,4))
        >>> b = np.arange(4)
        >>> pt.inner(a, b).to_array()
        array([[ 14,  38,  62],
               [ 86, 110, 134]], dtype=int64)


        We could perform the same computations with sparse tensors of any format.


        Returns
        ---------
        res: tensor
            The inner product of the tensors passed in.
    """
    t1, t2 = as_tensor(t1, False), as_tensor(t2, False)

    if t1.order == 0 or t2.order == 0:
        return tensor_mul(t1, t2, out_format, dtype)

    if t1.order != 0 and t2.order != 0 and t1.shape[-1] != t2.shape[-1]:
        raise ValueError("Last dimensions of t1 and t2 must be equal but t1 has dimension {} while "
                         "t2 has dimension %".format(t1.shape[-1], t2.shape[-1]))

    out_dtype = _cm.max_type(t1.dtype, t2.dtype) if dtype is None else dtype
    out_shape = t1.shape[:-1] + t2.shape[:-1]

    reduction_var = _cm.index_var()
    t1_vars, t2_vars = _cm.get_index_vars(len(t1.shape[:-1])), _cm.get_index_vars(len(t2.shape[:-1]))
    result_vars = t1_vars + t2_vars

    t1_vars.append(reduction_var)
    t2_vars.append(reduction_var)

    result_tensor = tensor(out_shape, out_format, dtype=out_dtype)
    result_tensor[result_vars] = t1[t1_vars] * t2[t2_vars]
    return result_tensor


def _dot_output_shape(shape1, shape2):

    if shape1[-1] != shape2[-2]:
        raise ValueError("Input operand1 has value {} in dimension 0 while operand2 has value {} in dimension 1."
                         " These two dimensions must be equal".format(shape1[-1], shape2[-2]))

    return shape1[:-1] + shape2[:-2] + shape2[-1:]


def dot(t1, t2, out_format=default_mode, dtype=None):
    """
    The dot product of two tensors.

    This implements the same spec as numpy but allows for sparse taco tensors as operands.

    * If both t1 and t2 are 1-D then this is an inner product.

    * If either operand is a scalar, this is equivalent to element-wise multiply.

    * Equivalent to matrix multiplication if both tensors are 2-D

    * If t1 is N-D and t2 is 1-D,t is a sum product over the last axis of t1 and t2

    * if t1 is an N-D array and t2 is an M-D array (where M>=2), it is a sum product over the last axis of t1 and the
      second-to-last axis of t2:


    We could write this using index expressions for two 3-D tensors as:
    ``T[i, j, k, m] = t1[i, j, l] * t2[k, l, m]``

    In the above, T = dot(t1, t2)

    Parameters
    ------------
    t1, t2: tensors, array_like
        The arguments to dot. The side of the last dimension of t1 must be equal to the size of the second to last
        dimension of t2.

    out_format: format, mode_format, optional
        * If a :class:`format` is specified, the result tensor is stored in the format out_format.
        * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
          stored in the :class:`mode_format` passed in.

    dtype: Datatype
        The datatype of the output tensor.


    Examples
    ---------
    >>> import pytaco as pt
    >>> pt.dot(3, 4)[0]
    12.0
    >>> a = [[1, 0], [0, 1]]
    >>> b = [[4, 1], [2, 2]]
    >>> pt.dot(a, b).to_array()
    array([[4, 1],
           [2, 2]], dtype=int64)

    Returns
    --------
    res: tensor
        The dot product of the input tensors.


    """
    t1, t2 = as_tensor(t1, False), as_tensor(t2, False)
    if t1.order == 0 or t2.order <= 1:
        return inner(t1, t2, out_format, dtype)

    # Here we know that a is a non-scalar and b has order at least 2
    out_shape = _dot_output_shape(t1.shape, t2.shape)
    out_dtype = _cm.max_type(t1.dtype, t2.dtype) if dtype is None else dtype

    t1_vars = _cm.get_index_vars(t1.order - 1)
    t2_vars = _cm.get_index_vars(t2.order - 1)
    res_vars = t1_vars + t2_vars

    reduction_var = _cm.index_var()
    t1_vars.append(reduction_var)
    t2_vars = t2_vars[:-1] + [reduction_var] + t2_vars[-1:]
    result_tensor = tensor(out_shape, fmt=out_format, dtype=out_dtype)
    result_tensor[res_vars] = t1[t1_vars] * t2[t2_vars]
    return result_tensor


def outer(t1, t2, out_format=default_mode, dtype=None):
    """
        Computes the outer product of two vectors.

        Parameters
        -----------
        t1, t2: tensor, array_like
            The input vectors. If the input is not a vector an error is raised.

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.


        Examples
        -----------
        >>> import numpy as np
        >>> import pytaco as pt
        >>> a = pt.from_array(np.arange(3))
        >>> pt.outer(a, a).to_array()
        array([[0, 0, 0],
               [0, 1, 2],
               [0, 2, 4]], dtype=int64)



        Returns
        --------
        outer: tensor
            The outer product of the tensors passed in.



    """
    t1, t2 = as_tensor(t1, False), as_tensor(t2, False)
    t1_order = t1.order
    t2_order = t2.order
    if t1_order == 0 or t2_order == 0:
        return tensor_mul(t1, t2, out_format, dtype)

    if t1_order != 1:
        raise ValueError("Can only perform outer product with vectors and scalars but first "
                         "operand has {} dimensions.".format(t1.order))

    if t2_order != 1:
        raise ValueError("Can only perform outer product with vectors and scalars but second "
                         "operand has {} dimensions.".format(t2.order))

    out_shape = t1.shape + t2.shape
    out_dtype = _cm.max_type(t1.dtype, t2.dtype) if dtype is None else dtype
    out_vars = _cm.get_index_vars(2)

    result_tensor = tensor(out_shape, fmt=out_format, dtype=out_dtype)
    result_tensor[out_vars] = t1[out_vars[0]] * t2[out_vars[1]]
    return result_tensor


def tensordot(t1, t2, axes=2, out_format=default_mode, dtype = None):
    """
        Compute the tensor dot product along the specified axes for tensors of order >= 1.

        Given two tensors and an iterable containing two iterables ``(t1_axes, t2_axes)``, sum the products
        of t1 and t2 elements over the axes specified by ``t1_axes`` and ``t2_axes``. The third (axes) argument
        can be a non-negative integer scalar such as N. In this case, the last ``N`` dimensions of t1 and the first
        ``N`` dimensions of t2 are summed over.

        Parameters
        ------------
        t1, t2: tensor, array_like, dimensions >= 1
            The two tensors we would like to dot.

        axes: int or array_like
            * If an integer, sum over the last N axes of t1 and the first N axes of t2. The size of the dimensions must
              match.

            * If array_like or a list of axes to sum over, the first iterable is the list of axes applying to t1 and the
              second iterable is a list of axes applying to t2

        out_format: format, mode_format, optional
            * If a :class:`format` is specified, the result tensor is stored in the format out_format.
            * If a :class:`mode_format` is specified, the result the result tensor has a with all of the dimensions
              stored in the :class:`mode_format` passed in.

        dtype: Datatype
            The datatype of the output tensor.

         Notes
         ------
         When there is more than one axis to sum over - and they are not the last axis of t1 and the first axis of t2,
         the argument axes should consist of two sequences of the same length, with the first axis to sum over given
         first in both sequences, the second axis second, and so forth.

         As with all expressions, some sparse tensors cannot be computed without explicity transposing  because taco is
         unable to simultaneously iterate over both tensors. In this case, an error will be thrown asking that the user
         transposes one of the tensors.

         Examples
         -----------
         Here we use an example where we show using the axes as a list. We 'dot' the 1st axis of t1 with the 0-th axis
         of t2 and the 0th axis of t1 with the 1st axis of t2.

         .. doctest::

            >>> import pytaco as pt
            >>> from pytaco import compressed, dense
            >>> import numpy as np
            >>> from scipy.sparse import csr_matrix, csc_matrix
            >>> t1 = np.triu(np.arange(60.).reshape(3,4,5))
            >>> t2 = np.triu(np.arange(24.).reshape(4,3,2))
            >>> res = pt.tensordot(t1,t2, axes=([1,0],[0,1]))
            >>> res.shape
            [5, 2]
            >>> res.to_array()
            array([[   0.,   60.],
                   [  36.,  340.],
                   [ 186.,  996.],
                   [ 528., 2184.],
                   [ 564., 2272.]])

            # We can use sparse tensors to do the same thing
            >>> fmt1 = pt.format([compressed, dense, compressed])
            >>> fmt2 = pt.format([compressed, dense, compressed], [1, 0, 2])
            >>> t3 = pt.remove_explicit_zeros(t1, fmt1)
            >>> t4 = pt.remove_explicit_zeros(t2, fmt2)
            >>> res2 = pt.tensordot(t3,t4, axes=([1, 0], [0, 1]), out_format=dense)
            >>> res2.to_array()
            array([[   0.,   60.],
                   [  36.,  340.],
                   [ 186.,  996.],
                   [ 528., 2184.],
                   [ 564., 2272.]])

         Note that in the above, we had to switch the mode ordering in order to allow taco to perform the computation.
         In general, when dealing with sparse structures, assuming our axes are in the form ``(a_axes, b_axes)`` we want to
         ensure that ``a_axes[i] == b_axes[mode_ordering[i]]``. For the curious, we are currently exploring ways to lift
         this restriction.

         Also note that the above example is equivalent to writing ``Res[k, l] = t3[j, i, k] * t4[i, j, l]`` in index
         notation.


         Returns
         --------
         result: tensor
            The contraction of the two tensors passed in based on the axes given.

    """

    # This is largely adapted from numpy's tensordot source code
    t1, t2 = as_tensor(t1, False), as_tensor(t2, False)
    try:
        iter(axes)
    except Exception:
        axes_t1 = list(range(-axes, 0))
        axes_t2 = list(range(0, axes))
    else:
        axes_t1, axes_t2 = axes

    try:
        nt1 = len(axes_t1)
        axes_t1 = list(axes_t1)
    except TypeError:
        axes_t1 = [axes_t1]
        nt1 = 1

    try:
        nt2 = len(axes_t2)
        axes_t2 = list(axes_t2)
    except TypeError:
        axes_t2 = [axes_t2]
        nt2 = 1

    t1_shape = t1.shape
    t1_order = t1.order
    t2_shape = t2.shape
    t2_order = t2.order

    equal = True

    if nt1 != nt2:
        equal = False
    else:
        for k in range(nt1):
            if t1_shape[axes_t1[k]] != t2_shape[axes_t2[k]]:
                equal = False
                break
            if axes_t1[k] < 0:
                axes_t1[k] += t1_order
            if axes_t2[k] < 0:
                axes_t2[k] += t2_order
    if not equal:
        raise ValueError("shape-mismatch for sum")

    notin_t1 = [k for k in range(t1_order) if k not in axes_t1]
    out_shape_t1 = [t1_shape[axis] for axis in notin_t1]

    notin_t2 = [k for k in range(t2_order) if k not in axes_t2]
    out_shape_t2 = [t2_shape[axis] for axis in notin_t2]

    static_vars_t1 = _cm.get_index_vars(len(notin_t1))
    static_vars_t2 = _cm.get_index_vars(len(notin_t2))
    reduction_vars = _cm.get_index_vars(nt1)

    t1_vars = [None] * t1_order
    t2_vars = [None] * t2_order
    for k in range(len(reduction_vars)):
        t1_vars[axes_t1[k]] = reduction_vars[k]
        t2_vars[axes_t2[k]] = reduction_vars[k]

    for k in range(len(static_vars_t1)):
        t1_vars[notin_t1[k]] = static_vars_t1[k]

    for k in range(len(static_vars_t2)):
        t2_vars[notin_t2[k]] = static_vars_t2[k]

    out_vars = static_vars_t1 + static_vars_t2

    out_shape = out_shape_t1 + out_shape_t2
    out_dtype = _cm.max_type(t1.dtype, t2.dtype) if dtype is None else dtype
    result_tensor = tensor(out_shape, fmt=out_format, dtype=out_dtype)
    result_tensor[out_vars] = t1[t1_vars] * t2[t2_vars]
    return result_tensor


def evaluate(expr, *operands, out_format=None, dtype=None):
    """
    Evaluates the index notation expression on the input operands.

    An output tensor may be optionally specified. In this case, the tensor should be given the expected output shape,
    format and dtype since the out_format and dtype fields will be ignored if an output tensor is seen.

    Parameters
    ------------
    expr: str
        Specifies an index expression as a string. This must be of the form ``res(i1, i2, ...) = expr``. See the
        examples for a more specific example. Each object represented by a name in the string and is indexed by a
        variable for each dimension.

    operands: list of tensors or array_like
        Specifies the input tensors OR the input and output tensor. If the length of the list is equal to N - 1 where
        N is the number of terms in the input expression then it is assumed that no output tensor was specified and
        taco infers the output shape and uses the out_format and dtype passed in for the output tensor. If the
        length of the operands is equal to N, then the first tensor is assumed to be the output tensor and the
        out_format and dtype fields are ignored.

    out_format: format, optional
        The storage format of the output tensor if one was not explicitly provided. If left to None and no output
        tensor was provided then all modes default to dense.

    dtype: datatype
        The datatype of the output tensor. If left to None and no output tensor was explicitly specified then taco uses
        its promotion rules and sets the output to the datatype with the highest type.

    Notes
    -------
    This provides a convenient way to express tensor expressions. It is identical to the Index Expression syntax with
    a few exceptions. There is no need to use ``t[None]`` when making expressions with scalars and square brackets are
    replaced with parenthesis. For example, in python we can represent matrix multiply as
    ``A[i, j] = B[i, k] * C[k, j]`` while the corresponding tensor expression would be ``A(i, j) = B(i, k) * C(k, j)``.
    Further, reductions in pythonic index expression notation would be expressed as ``A[None] = B[i, j]`` to sum all
    the elements of a matrix while the corresponding string would be ``A = B(i, j)``.


    The string parser currently only supports +, -, / and *. Thus, expressions involving other functions such as exp,
    tan etc, would have to be written using the pythonic expressions.

    An input tensor is recognised by the parser by a name followed by a comma separated list of index variables in
    parenthesis. Thus ``A(i,j,k)`` represents an order 3 tensor with the name A. The names used in the expression are
    irrelevant since taco will match the operands with the terms in the expression in the same order they appear
    (which is why when specifying an output, the output tensor must appear first followed by its input).

    As with index expressions, index variables appearing that are on the right hand side of the expression but not
    in the result are always summed.

    Examples
    ----------

    .. doctest::

        >>> import numpy as np
        >>> import pytaco as pt
        >>> a = np.arange(25).reshape(5, 5)
        >>> t = pt.tensor([5, 5], pt.csr)
        >>> for i in range(5): t.insert([i, i], a[i, i])
        >>> vec = np.arange(5)

        # Note that no output is specified.
        # We can sum over any of the axes of the sparse tensor as follows:
        >>> pt.evaluate("T(j) = A(i, j)", t).to_array() # defaults to dense vector
        array([ 0.,  6., 12., 18., 24.], dtype=float32)

        # Specify an output
        >>> result = pt.tensor([5], pt.dense)
        >>> pt.evaluate("T(j) = A(i, j)", result, t).to_array()
        array([ 0.,  6., 12., 18., 24.], dtype=float32)
        >>> result.to_array()
        array([ 0.,  6., 12., 18., 24.], dtype=float32)

        # We can perform addition and broadcast along a given axis
        >>> pt.evaluate("T(i, j) = A(i, j) + B(j)", t, vec, out_format=pt.csr).to_array()
        array([[ 0.,  1.,  2.,  3.,  4.],
               [ 0.,  7.,  2.,  3.,  4.],
               [ 0.,  1., 14.,  3.,  4.],
               [ 0.,  1.,  2., 21.,  4.],
               [ 0.,  1.,  2.,  3., 28.]], dtype=float32)


        # Create a SpMV kernel (since t is csr)
        >>> pt.evaluate("A(j) = M(i, j) * V(j)", t, vec).to_array()
        array([ 0.,  6., 24., 54., 96.], dtype=float32)

        # Sum tensor elements, note that names used don't matter
        >>> pt.evaluate("S = C(i, j)", t)[0]
        60.0

    Examples of reductions along with computations. Note indices that appear of the right hand side but not
    on the left hand side get summed over. This means we can implement matrix multiplication as shown below:

    .. doctest::

        >>> from scipy.sparse import csc_matrix
        >>> mat  = np.arange(9).reshape(3, 3)
        >>> mat2 = csc_matrix(np.triu(np.arange(6).reshape(3, 2)))

        # Compute mat @ mat2 due to ordering of operands.
        >>> res = pt.evaluate("T(i, j) = A(i, k) * B(k, j)", mat, mat2, out_format=pt.csr)
        >>> numpy_res = np.matmul(mat, mat2.toarray())
        >>> all(res == numpy_res)
        True


    Returns
    ---------
    output: tensor
        The tensor calculated based on the string expression passed in. Even if taco detects that an output is
        specified, it will still return a reference to that tensor.

    """

    args = [as_tensor(t, False) for t in operands]
    if len(args) == 0:
        raise ValueError("Expression must have at least one operand on the LHS and one on the RHS.")

    out_dtype = args[0].dtype if dtype is None else dtype
    if dtype is None:
        for i in range(1, len(args)):
            out_dtype = _cm.max_type(out_dtype, args[i].dtype)

    tensor_base = _cm._parse(expr, [t._tensor for t in args], out_format, out_dtype)
    return tensor.from_tensor_base(tensor_base)


def einsum(expr, *operands, out_format=None, dtype=None):
    """
    Evaluates the Einstein summation convention on the input operands.

    The einsum summation convention employed here is very similar to `numpy's
    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.einsum.html#numpy.einsum>`_ but has some differences
    explained below.

    Taco's einsum can express a wide variety of linear algebra expressions in a simple fashion. Einsum can be used
    in implicit mode where no output indices are specified. In this mode, it follows the usual einstein summation
    convention to compute an output. In explicit mode, the user can force summation over specified subscript variables.

    Note that this einsum parser is a subset of what taco can express. The full :func:`~evaluate` supports a much
    larger range of possible expressions.

    See the notes section for more details.

    Warnings
    ----------
    This differs from numpy's einsum in two important ways. The first is that the same subscript cannot appear more than
    once in a given operand. The second is that for sparse tensors, some expressions may require the user to explicitly
    transpose the tensors before passing them into einsum.

    Parameters
    ------------
    expr: str
        Specifies the subscripts for summation as a comma separated list of subscript variables. An implicit (Classical
        Einstein summation) is calculation is performed unless there is an explicit indicator '->' included along with
        subscript labels specifying the output.

    operands: list of array_like, tensors, scipy csr and scipy csc matrices
        This specifies the operands for the computation. Taco will copy any numpy arrays that are not stored in
        row-major or column-major format.

    out_format: format, optional
        The storage :class:`format` of the output tensor. Note that unlike the other tensor functions the format here
        must be an actual format instance and cannot be a mode format. If the out_format is left to none, all the modes
        default to dense.

     dtype: datatype, optional
        The datatype of the output tensor.


    See also
    ----------
    :func:`evaluate`

    Notes
    --------

    `einsum` provides a succint way to represent a large number of tensor algebra expressions. A list of some possible
    operations along with some examples is presented below:

    * Sum axes of tensor :func:`~tensor_sum`
    * Transpose tensors or transpose to dense tensors :func:`tensor.transpose`
    * Matrix multiplication and dot products :func:`dot` and :func:`matmul`
    * Tensor contractions :func:`tensordot`

    The expr string is a comma separated list of subscript labels where each label corresponds to a dimension in the
    tensor. In implicit mode, repeated subscripts are summed. This means that ``pt.einsum('ij,jk', t1, t2)`` is matrix
    multiplication since the j indices are implicitly summed over.

    In explicit mode, we can specify the output directly by using the '->' identifier along with a list of the output
    labels. This means that expressions such as ``pt.einsum('i->', t)`` take the sum of all the elements in the vector
    and return a scalar. The above is equivalent to ``pt.tensor_sum(t)``

    `einsum` also allows for broadcasting using ellipses. For instance, to do a matrix multiply with only the
    right most dimensions of an order tensor, we can write ``pt.einsum("...ik,...kj->...ij")``

    Examples
    -----------
    .. doctest::

        >>> import numpy as np
        >>> import pytaco as pt
        >>> a = np.arange(25).reshape(5, 5)
        >>> t = pt.tensor([5, 5], pt.csr)
        >>> for i in range(5): t.insert([i, i], a[i, i])
        >>> vec = np.arange(5)

        # We can sum over any of the axes of the sparse tensor as follows:
        >>> pt.einsum("ij->j", t).to_array() # defaults to dense vector
        array([ 0.,  6., 12., 18., 24.], dtype=float32)

        >>> pt.tensor_sum(t, axis=1).to_array()
        array([ 0.,  6., 12., 18., 24.], dtype=float32)

        # We can perform matrix multiply in implicit mode
        # Note that is Sparse x Dense matrix multiply due to the formats

        >>> pt.einsum("ij,jk", t, a, out_format=pt.csr).to_array()
        array([[  0.,   0.,   0.,   0.,   0.],
               [ 30.,  36.,  42.,  48.,  54.],
               [120., 132., 144., 156., 168.],
               [270., 288., 306., 324., 342.],
               [480., 504., 528., 552., 576.]], dtype=float32)

        # Create a SpMV kernel (since t is csr)
        >>> pt.einsum("ij,j->j", t, vec).to_array()
        array([ 0.,  6., 24., 54., 96.], dtype=float32)

        # Equivalently, we could write
        >>> np.array(pt.inner(t, vec, pt.dense), copy=False)
        array([ 0.,  6., 24., 54., 96.], dtype=float32)

        # Sum tensor elements
        >>> pt.einsum("ij->", t)[0]
        60.0
        >>> pt.tensor_sum(t)[0]
        60.0

    The `numpy docs <https://docs.scipy.org/doc/numpy/reference/generated/numpy.einsum.html#numpy.einsum>`_ contain
    more examples and an in depth explanation of this notation can be found
    `here <https://rockt.github.io/2018/04/30/einsum>`_.

    Returns
    --------
    out: tensor
        The result of the einstein summation computation.

    """

    args = [as_tensor(t, False) for t in operands]
    out_dtype = args[0].dtype if dtype is None else dtype
    if dtype is None:
        for i in range(1, len(args)):
            out_dtype = _cm.max_type(out_dtype, args[i].dtype)

    ein = _cm._einsum(expr, [t._tensor for t in args], out_format, out_dtype)
    return tensor.from_tensor_base(ein)


def apply(func_name, arg_list, output_zero_specifier):
    """
        Applies a user defined function to an :class:`index_expression`.

        Parameters
        ------------
        func_name: str
            The name of the function to call from the udf dir.

        arg_list: list of index_expression
            The arguments that the function ``func_name`` should be called with. This should be index expressions
            corresponding to the number of arguments ``func_name`` expects.

        output_zero_specifier: list of ints:
            This specifies that if all inputs in corresponding to ``arg_list[output_zero_specifier[i]``] are 0, then
            the output of the entire function should be 0. If this list is empty, then this means the function always
            outputs a non-zero (as is the case with exp). If this list is ``[0]``, this means that  whenever the first
            argument is 0, the output should be 0. A list of ``[1]`` implies whenever the second argument is 0, the
            output should be 0 and so on. ``[0, 1]`` means that if both inputs to the function are 0, then a zero output
            is expected.



        Notes
        -------
        This function searches in the directory specified by calling :func:`set_udf_dir` for a C header that declares 
        and implements a function with name  ``func_name``. If that function is found, taco will attempt to use that function 
        during compilation to perform computations on the expressions in ``arg_list``. The function must be implemented in C99.

    """
    raise NotImplementedError


def set_udf_dir(dir_to_search):
    """
        Sets the directory to search for user defined functions.

        Parameters
        ------------
        dir_to_search: str
            The directory that taco should search when looking for user defined functions.
    """
    raise NotImplementedError
